Take-home_Ex02

Author

Zhang Chenbin

DataVis Makeover

Getting Started

Loading the required packages

pacman::p_load(dplyr, purrr, readr, ggiraph,
               ggplot2, lubridate, ggrepel,
               patchwork, ggthemes, hrbrthemes, tidyverse, plotly, readr)

Loading the data into the R environment

data1 = read_csv("data/ResidentialTransaction20240308160536.csv")
data2 = read_csv("data/ResidentialTransaction20240308160736.csv")
data3 = read_csv("data/ResidentialTransaction20240308161009.csv")
data4 = read_csv("data/ResidentialTransaction20240308161109.csv")
data5 = read_csv("data/ResidentialTransaction20240414220633.csv")
combined_data <- bind_rows(data1, data2, data3, data4, data5)

glimpse(combined_data)
Rows: 26,806
Columns: 21
$ `Project Name`                <chr> "THE REEF AT KING'S DOCK", "URBAN TREASU…
$ `Transacted Price ($)`        <dbl> 2317000, 1823500, 1421112, 1258112, 1280…
$ `Area (SQFT)`                 <dbl> 882.65, 882.65, 1076.40, 1033.34, 871.88…
$ `Unit Price ($ PSF)`          <dbl> 2625, 2066, 1320, 1218, 1468, 1767, 1095…
$ `Sale Date`                   <chr> "01 Jan 2023", "02 Jan 2023", "02 Jan 20…
$ Address                       <chr> "12 HARBOURFRONT AVENUE #05-32", "205 JA…
$ `Type of Sale`                <chr> "New Sale", "New Sale", "New Sale", "New…
$ `Type of Area`                <chr> "Strata", "Strata", "Strata", "Strata", …
$ `Area (SQM)`                  <dbl> 82.0, 82.0, 100.0, 96.0, 81.0, 308.7, 42…
$ `Unit Price ($ PSM)`          <dbl> 28256, 22238, 14211, 13105, 15802, 19015…
$ `Nett Price($)`               <chr> "-", "-", "-", "-", "-", "-", "-", "-", …
$ `Property Type`               <chr> "Condominium", "Condominium", "Executive…
$ `Number of Units`             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ Tenure                        <chr> "99 yrs from 12/01/2021", "Freehold", "9…
$ `Completion Date`             <chr> "Uncompleted", "Uncompleted", "Uncomplet…
$ `Purchaser Address Indicator` <chr> "HDB", "Private", "HDB", "HDB", "HDB", "…
$ `Postal Code`                 <chr> "097996", "419535", "269343", "269294", …
$ `Postal District`             <chr> "04", "14", "27", "27", "28", "19", "10"…
$ `Postal Sector`               <chr> "09", "41", "26", "26", "79", "54", "27"…
$ `Planning Region`             <chr> "Central Region", "East Region", "North …
$ `Planning Area`               <chr> "Bukit Merah", "Bedok", "Yishun", "Yishu…

Preparing the data

# Perform the mutations step by step to avoid complexity and potential errors
data_cleaned <- combined_data %>%
  mutate(`Sale Date` = dmy(`Sale Date`)) %>%
  mutate(`Area (SQM)` = as.numeric(gsub(",", "", `Area (SQM)`))) %>%
  mutate(`Unit Price ($ PSM)` = as.numeric(gsub("\\$", "", gsub(",", "", `Unit Price ($ PSM)`)))) %>%
  mutate(`Area (SQM)` = ifelse(is.na(`Area (SQM)`), 0, `Area (SQM)`)) %>%
  mutate(Area_Category = cut(
    `Area (SQM)`,
    breaks = c(0, 100, 200, 300, 400, Inf),
    labels = c("<100", "100-200", "200-300", "300-400", ">400"),
    include.lowest = TRUE
  ))

# Adding Quarter separately
data_cleaned <- data_cleaned %>%
  mutate(Quarter = ifelse(month(`Sale Date`) %in% c(1, 2, 3), "Q1",
                          ifelse(month(`Sale Date`) %in% c(4, 5, 6), "Q2",
                                 ifelse(month(`Sale Date`) %in% c(7, 8, 9), "Q3", "Q4"))))

# Output the structure of data_cleaned to confirm changes
str(data_cleaned)
tibble [26,806 × 23] (S3: tbl_df/tbl/data.frame)
 $ Project Name               : chr [1:26806] "THE REEF AT KING'S DOCK" "URBAN TREASURES" "NORTH GAIA" "NORTH GAIA" ...
 $ Transacted Price ($)       : num [1:26806] 2317000 1823500 1421112 1258112 1280000 ...
 $ Area (SQFT)                : num [1:26806] 883 883 1076 1033 872 ...
 $ Unit Price ($ PSF)         : num [1:26806] 2625 2066 1320 1218 1468 ...
 $ Sale Date                  : Date[1:26806], format: "2023-01-01" "2023-01-02" ...
 $ Address                    : chr [1:26806] "12 HARBOURFRONT AVENUE #05-32" "205 JALAN EUNOS #08-02" "29 YISHUN CLOSE #08-10" "45 YISHUN CLOSE #07-42" ...
 $ Type of Sale               : chr [1:26806] "New Sale" "New Sale" "New Sale" "New Sale" ...
 $ Type of Area               : chr [1:26806] "Strata" "Strata" "Strata" "Strata" ...
 $ Area (SQM)                 : num [1:26806] 82 82 100 96 81 ...
 $ Unit Price ($ PSM)         : num [1:26806] 28256 22238 14211 13105 15802 ...
 $ Nett Price($)              : chr [1:26806] "-" "-" "-" "-" ...
 $ Property Type              : chr [1:26806] "Condominium" "Condominium" "Executive Condominium" "Executive Condominium" ...
 $ Number of Units            : num [1:26806] 1 1 1 1 1 1 1 1 1 1 ...
 $ Tenure                     : chr [1:26806] "99 yrs from 12/01/2021" "Freehold" "99 yrs from 15/02/2021" "99 yrs from 15/02/2021" ...
 $ Completion Date            : chr [1:26806] "Uncompleted" "Uncompleted" "Uncompleted" "Uncompleted" ...
 $ Purchaser Address Indicator: chr [1:26806] "HDB" "Private" "HDB" "HDB" ...
 $ Postal Code                : chr [1:26806] "097996" "419535" "269343" "269294" ...
 $ Postal District            : chr [1:26806] "04" "14" "27" "27" ...
 $ Postal Sector              : chr [1:26806] "09" "41" "26" "26" ...
 $ Planning Region            : chr [1:26806] "Central Region" "East Region" "North Region" "North Region" ...
 $ Planning Area              : chr [1:26806] "Bukit Merah" "Bedok" "Yishun" "Yishun" ...
 $ Area_Category              : Factor w/ 5 levels "<100","100-200",..: 1 1 1 1 1 4 5 2 1 2 ...
 $ Quarter                    : chr [1:26806] "Q1" "Q1" "Q1" "Q1" ...
duplicate <- combined_data %>% 
  group_by_all() %>% 
  filter(n()>1) %>% 
  ungroup()
  
duplicate
# A tibble: 0 × 21
# ℹ 21 variables: Project Name <chr>, Transacted Price ($) <dbl>,
#   Area (SQFT) <dbl>, Unit Price ($ PSF) <dbl>, Sale Date <chr>,
#   Address <chr>, Type of Sale <chr>, Type of Area <chr>, Area (SQM) <dbl>,
#   Unit Price ($ PSM) <dbl>, Nett Price($) <chr>, Property Type <chr>,
#   Number of Units <dbl>, Tenure <chr>, Completion Date <chr>,
#   Purchaser Address Indicator <chr>, Postal Code <chr>,
#   Postal District <chr>, Postal Sector <chr>, Planning Region <chr>, …
F1 <- ggplot(combined_data, aes(x = `Property Type`)) + 
    geom_bar_interactive(aes(fill = `Planning Region`), position = "dodge") +  
    labs(x = "Property Type", y = "Frequency",
         title = "Frequency of Property Types by Planning Region") +
    facet_wrap(~ `Planning Region`, scales = "free") +
    theme_stata(base_size = 2.5)
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) 
List of 1
 $ axis.text.x:List of 11
  ..$ family       : NULL
  ..$ face         : NULL
  ..$ colour       : NULL
  ..$ size         : NULL
  ..$ hjust        : num 1
  ..$ vjust        : NULL
  ..$ angle        : num 45
  ..$ lineheight   : NULL
  ..$ margin       : NULL
  ..$ debug        : NULL
  ..$ inherit.blank: logi FALSE
  ..- attr(*, "class")= chr [1:2] "element_text" "element"
 - attr(*, "class")= chr [1:2] "theme" "gg"
 - attr(*, "complete")= logi FALSE
 - attr(*, "validate")= logi TRUE
    F1

DataViz Makeover

Step 1: time analysis

The absence of data from the previous year’s four quarters in the scatter plot analyzing the relationship between transacted prices and area poses certain challenges. Primarily, it limits the depth of market trend analysis over an extended period, potentially overlooking seasonal fluctuations and economic cycles that significantly influence real estate markets. Without this broader temporal context, the analysis might give undue emphasis to short-term variations or isolated events, potentially leading to a somewhat narrow perspective on market dynamics. Such a limitation curtails the capacity to observe and interpret long-term trends and the cyclical nature of the housing market, which are essential for comprehensive statistical analysis and forecasting.

Furthermore, the lack of previous year data restricts year-over-year comparative analysis, crucial for evaluating market condition changes and understanding the influence of external economic factors like interest rate shifts or broader economic changes. For investors and policymakers, the unavailability of a more extended data range may hinder informed strategic decision-making. It limits the development of predictive models that rely on historical data to accurately project future market behaviors, thereby subtly reducing the analysis’s overall utility in supporting nuanced business and investment decisions.

To address the challenges of a limited temporal analysis, I have enhanced the dataset by incorporating data from the previous year and categorizing it by quarters. This addition allows for a more nuanced exploration of the changes and trends over time. By analyzing the data on a quarterly basis, we can better understand the seasonal fluctuations and the impact of economic cycles on the real estate market.

P3_interactive <- plot_ly(data = data_cleaned, x = ~`Area (SQM)`, y = ~`Unit Price ($ PSM)`, type = 'scatter', mode = 'markers',
                          hoverinfo = 'text',
                          transforms = list(
                            list(
                              type = 'filter',
                              target = ~Quarter,
                              operation = '=',
                              value = unique(data_cleaned$Quarter)[1]  
                            )
                          ),
                          text = ~paste("Price: ", `Unit Price ($ PSM)`, "$/sqm<br>Area: ", `Area (SQM)`, "sqm<br>Quarter: ", Quarter)) %>%
  layout(title = 'Interactive Scatter Plot of Unit Price vs. Area by Category',
         xaxis = list(title = 'Area (SQM)'),
         yaxis = list(title = 'Unit Price ($ PSM)'),
         sliders = list(list(
           active = 0,
           currentvalue = list(prefix = "Quarter: "),
           steps = lapply(unique(data_cleaned$Quarter), function(q) {
             list(label = q, method = "restyle", args = list("transforms[0].value", q))
           })
         )))

P3_interactive

Step 2: Type of Sale

Failing to categorize data by “Type of Sale” in an analysis of real estate transactions introduces significant limitations that can obscure crucial insights into different market segments. Without this categorization, the analysis merges various types of transactions, such as new sales, resales, and sub-sales, into a single aggregated dataset. This aggregation can lead to a generalized overview that fails to capture the distinct behaviors and trends associated with each sale type.

Firstly, different sale types often exhibit unique pricing patterns, demand cycles, and buyer preferences. For instance, new sales might be influenced by developer promotions and economic incentives, while resales are impacted more by the existing housing market conditions. Sub-sales, involving properties sold before their construction completion, might fluctuate based on speculative market sentiments. Without distinguishing these types, strategic decision-making becomes challenging as the nuanced dynamics of the market are not adequately represented.

Moreover, policies and marketing strategies tailored to specific sale types cannot be effectively formulated or implemented without a clear understanding of the particular characteristics and needs of each segment. For example, marketing strategies that are effective for new developments might not work for resales. Therefore, by not categorizing data by “Type of Sale,” the analysis loses the potential to guide targeted interventions and optimize resource allocation, potentially leading to less effective strategies and missed opportunities in the market.

To enhance our analysis, I’ve processed the data to categorize it by different “Type of Sale.” This categorization allows us to delve into the specific characteristics and trends of new sales, resales, and sub-sales separately.

P3_interactive <- plot_ly(data = data_cleaned, x = ~`Area (SQM)`, y = ~`Unit Price ($ PSM)`, type = 'scatter', mode = 'markers',
                          hoverinfo = 'text',
                          transforms = list(
                            list(
                              type = 'filter',
                              target = ~Quarter,
                              operation = '=',
                              value = unique(data_cleaned$Quarter)[1]  
                            ),
                            list(
                              type = 'filter',
                              target = ~`Type of Sale`,
                              operation = '=',
                              value = unique(data_cleaned$`Type of Sale`)[1] 
                            )
                          ),
                          text = ~paste("Price: ", `Unit Price ($ PSM)`, "$/sqm<br>Area: ", `Area (SQM)`, "sqm<br>Quarter: ", Quarter, "<br>Type of Sale: ", `Type of Sale`)) %>%
  layout(title = 'Interactive Scatter Plot of Unit Price vs. Area by Category',
         xaxis = list(title = 'Area (SQM)'),
         yaxis = list(title = 'Unit Price ($ PSM)'),
         updatemenus = list(
           list(
             type = "dropdown",
             direction = "down",
             showactive = TRUE,
             buttons = lapply(unique(data_cleaned$`Type of Sale`), function(type) {
               list(
                 method = "restyle",
                 args = list("transforms[1].value", type),  
                 label = type
               )
             })
           )
         ),
         sliders = list(list(
           active = 0,
           currentvalue = list(prefix = "Quarter: "),
           steps = lapply(unique(data_cleaned$Quarter), function(q) {
             list(
               label = q,
               method = "restyle",
               args = list("transforms[0].value", q)  
             )
           })
         )))

P3_interactive